This example uses the tidyverse suite of packages.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
The code chunk below reads in the final project data.
# Read the project training data; column types are inferred by readr
# (the resulting spec is echoed in the output below).
df <- readr::read_csv("paint_project_train_data.csv", col_names = TRUE)
## Rows: 835 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Lightness, Saturation
## dbl (6): R, G, B, Hue, response, outcome
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only the predictors and the outcome, and recode the 0/1 outcome as a
# two-level factor ('event' first, as caret expects for the positive class).
df_scaled <- df %>%
  select(R, G, B, Hue, Saturation, Lightness, outcome) %>%
  mutate(outcome = factor(ifelse(outcome == 1, 'event', 'non_event'),
                          levels = c('event', 'non_event')))
# Fix the RNG seed so repeated-CV fold assignments are reproducible.
set.seed(1234)
# 5-fold cross-validation repeated 3 times; keep all hold-out predictions.
my_ctrl_acc <- trainControl(method = 'repeatedcv', number = 5, repeats = 3, savePredictions = 'all')
# Metric used to select tuning parameters for the accuracy-focused models.
my_metrics_acc <- "Accuracy"
# Candidate model formulas, kept as strings and converted with as.formula()
# at fit time.
formulas <- list(
  # 1) additive model using every predictor
  'outcome ~ .',
  # 2) additive Lightness/Saturation plus all pairwise interactions of R, G, B, Hue
  'outcome ~ Lightness + Saturation + (R + G + B + Hue)^2',
  # 3) Lightness/Saturation crossed with the main effects and pairwise
  #    interactions of R, G, B, Hue.  Written with ^2 instead of the original
  #    redundant (R+G+B+Hue)*(R+G+B+Hue) repetition: R's formula term algebra
  #    deduplicates terms, so both forms expand to the same model terms.
  'outcome ~ (Lightness + Saturation)*((R + G + B + Hue)^2)',
  # 4) Lightness/Saturation/Hue crossed with the R, G, B main effects
  'outcome ~ (Lightness + Saturation + Hue)*(R + G + B)')
# Fit a logistic regression (method = 'glm') with caret.
#   form: model formula as a string
#   met:  metric name used for model selection (e.g. "Accuracy" or "ROC")
#   ctrl: a caret::trainControl object
#   data: training data; defaults to the global df_scaled so existing
#         three-argument callers keep working unchanged.
formula_to_glm_model <- function(form, met, ctrl, data = df_scaled) {
  caret::train(as.formula(form),
               data = data,
               method = 'glm',
               preProcess = c('center', 'scale'),
               metric = met,
               trControl = ctrl)
}
# Fit the default glm for every candidate formula, keeping each fitted model
# next to the formula string that produced it.
glm_default_acc <- tibble::tibble(
  formula = formulas,
  models = purrr::map(
    formulas,
    function(f) formula_to_glm_model(f, my_metrics_acc, my_ctrl_acc)
  )
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Fit an elastic net (method = 'glmnet') with caret's default tuning grid.
#   form: model formula as a string
#   met:  metric name used for model selection
#   ctrl: a caret::trainControl object
#   data: training data; defaults to the global df_scaled so existing
#         three-argument callers keep working unchanged.
formula_to_enet_model <- function(form, met, ctrl, data = df_scaled) {
  caret::train(as.formula(form),
               data = data,
               method = 'glmnet',
               preProcess = c('center', 'scale'),
               metric = met,
               trControl = ctrl)
}
# Default-grid elastic net for the two interaction formulas only.
# BUG FIX: the original mapped over `formula` -- which resolves to the
# stats::formula *function*, not the formula strings -- instead of the
# intended formulas[2:3] subset used for the `formula` column.
enet_default_acc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map(formulas[2:3], formula_to_enet_model, my_metrics_acc, my_ctrl_acc)
)
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
# Refit an elastic net over a custom tuning grid derived from a default run.
#   form:         model formula as a string
#   enet_default: a caret model from the default glmnet run; its explored
#                 lambda range anchors the refined grid
#   met:          metric name used for model selection
#   ctrl:         a caret::trainControl object
#   data:         training data; defaults to the global df_scaled so existing
#                 four-argument callers keep working unchanged.
formula_to_enet_tune_model <- function(form, enet_default, met, ctrl, data = df_scaled) {
  # 25 log-spaced lambda values spanning the default run's lambda range...
  my_lambda_grid <- exp(seq(log(min(enet_default$results$lambda)),
                            log(max(enet_default$results$lambda)),
                            length.out = 25))
  # ...crossed with a sweep over the elastic-net mixing parameter alpha.
  enet_grid <- expand.grid(alpha = seq(0.1, 1, by = 0.1),
                           lambda = my_lambda_grid)
  caret::train(as.formula(form),
               data = data,
               method = 'glmnet',
               preProcess = c('center', 'scale'),
               tuneGrid = enet_grid,
               metric = met,
               trControl = ctrl)
}
# Tuned elastic net for the two interaction formulas, seeding each tuning
# grid from the corresponding default model.
# BUG FIX: the original passed `formula` (the stats function) as map2's first
# input instead of the intended formulas[2:3] strings.
enet_tune_acc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map2(formulas[2:3], enet_default_acc$models,
                       formula_to_enet_tune_model,
                       my_metrics_acc, my_ctrl_acc, .progress = TRUE)
)
## ■■■■■■■■■■■■■■■■ 50% | ETA: 23s
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
##
# Tuning-parameter performance profiles for both tuned elastic nets
# (lambda on a log axis).
plot(enet_tune_acc$models[[1]], xTrans = log)
plot(enet_tune_acc$models[[2]], xTrans = log)
# Variable-importance rankings; the second model has many interaction terms,
# so only its top 20 are shown.
plot( varImp(enet_tune_acc$models[[1]]) )
plot( varImp(enet_tune_acc$models[[2]]), top = 20 )
# Neural network (nnet) with caret's default tuning grid, selected on
# accuracy; trace = FALSE silences nnet's per-iteration output.
nnet_default_acc <- train(
  outcome ~ .,
  data = df_scaled,
  method = 'nnet',
  preProcess = c("center", "scale"),
  metric = my_metrics_acc,
  trControl = my_ctrl_acc,
  trace = FALSE
)
# Custom sweep: hidden-layer sizes crossed with 11 log-spaced decay values.
nnet_grid <- expand.grid(
  size = c(5, 10, 20),
  decay = exp(seq(-6, 2, length.out = 11))
)
# Refit the neural network over the custom grid, still selecting on accuracy.
nnet_tune_acc <- train(
  outcome ~ .,
  data = df_scaled,
  method = 'nnet',
  preProcess = c("center", "scale"),
  metric = my_metrics_acc,
  tuneGrid = nnet_grid,
  trControl = my_ctrl_acc,
  trace = FALSE
)
# Performance across the grid (decay on a log axis).
plot(nnet_tune_acc, xTrans = log)
# Random forest with caret's default mtry grid; importance = TRUE so
# varImp() can rank the predictors afterwards.
rf_default_acc <- train(
  outcome ~ .,
  data = df_scaled,
  method = 'rf',
  metric = my_metrics_acc,
  trControl = my_ctrl_acc,
  importance = TRUE
)
plot( varImp(rf_default_acc) )
# Gradient-boosted trees (xgboost) with caret's default tuning grid.
# verbosity = 0 silences xgboost; nthread = 1 keeps the fit single-threaded
# so caret's resampling controls the parallelism.
xgb_default_acc <- caret::train(
  outcome ~ .,
  data = df_scaled,
  method = 'xgbTree',
  metric = my_metrics_acc,
  trControl = my_ctrl_acc,
  verbosity = 0,
  nthread = 1
)
plot(xgb_default_acc)
# Refined xgboost grid anchored at the default run's best tune: sweep tree
# count, depth, and learning rate while holding the remaining hyperparameters
# at their previously selected values.
xgb_grid <- expand.grid(nrounds = seq(25, 50, by = 25),
max_depth = c(3,6,9,12),
eta = c(0.25, 0.5, 1) * xgb_default_acc$bestTune$eta,
gamma = xgb_default_acc$bestTune$gamma,
colsample_bytree = xgb_default_acc$bestTune$colsample_bytree,
min_child_weight = xgb_default_acc$bestTune$min_child_weight,
subsample = xgb_default_acc$bestTune$subsample)
# Refit xgboost over the refined grid, still selecting on accuracy.
xgb_tune_acc <- caret::train(outcome ~ ., data = df_scaled,
metric = my_metrics_acc,
trControl = my_ctrl_acc,
method = 'xgbTree',
verbosity = 0,
tuneGrid = xgb_grid,
nthread = 1)
# Grid performance and variable-importance comparisons.
plot(xgb_tune_acc)
plot( varImp(xgb_default_acc) )
plot( varImp(xgb_tune_acc) )
# Collect the resampling results from every accuracy-selected model so their
# cross-validated performance can be compared on common folds.
caret_acc_compare <- resamples(
list(
glm_default_acc_1 = glm_default_acc$models[[1]],
glm_default_acc_2 = glm_default_acc$models[[2]],
glm_default_acc_3 = glm_default_acc$models[[3]],
glm_default_acc_4 = glm_default_acc$models[[4]],
enet_default_acc_1 = enet_default_acc$models[[1]],
enet_default_acc_2 = enet_default_acc$models[[2]],
enet_tune_acc_1 = enet_tune_acc$models[[1]],
enet_tune_acc_2 = enet_tune_acc$models[[2]],
nnet_default_acc = nnet_default_acc,
nnet_tune_acc = nnet_tune_acc,
rf_default_acc = rf_default_acc,
xgb_default_acc = xgb_default_acc,
xgb_tune_acc = xgb_tune_acc
)
)
# Dot plot of resampled accuracy for every model.
caret_acc_compare %>% dotplot(metric = my_metrics_acc)
# Named list of every accuracy-selected model, used to persist each one.
acc_models_list <- list(
  glm_default_acc_1 = glm_default_acc$models[[1]],
  glm_default_acc_2 = glm_default_acc$models[[2]],
  glm_default_acc_3 = glm_default_acc$models[[3]],
  glm_default_acc_4 = glm_default_acc$models[[4]],
  enet_default_acc_1 = enet_default_acc$models[[1]],
  enet_default_acc_2 = enet_default_acc$models[[2]],
  enet_tune_acc_1 = enet_tune_acc$models[[1]],
  enet_tune_acc_2 = enet_tune_acc$models[[2]],
  nnet_default_acc = nnet_default_acc,
  nnet_tune_acc = nnet_tune_acc,
  rf_default_acc = rf_default_acc,
  xgb_default_acc = xgb_default_acc,
  xgb_tune_acc = xgb_tune_acc
)
# Save each model to "<entry name>.rds" so it can be reloaded without
# re-training.  paste0() replaces the paste(..., sep = '') anti-pattern;
# the file names are unchanged.
for (name in names(acc_models_list)) {
  readr::write_rds(acc_models_list[[name]], paste0(name, '.rds'))
}
# Resampling control for ROC-based selection: twoClassSummary computes
# ROC/Sens/Spec, which requires class probabilities (classProbs = TRUE).
my_ctrl_roc <- trainControl(method = 'repeatedcv', number = 5, repeats = 3,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE)
# Metric used to select tuning parameters for the ROC-focused models.
my_metrics_roc <- 'ROC'
# Refit the default glm for every candidate formula, this time selecting on
# the area under the ROC curve.
glm_default_roc <- tibble::tibble(
  formula = formulas,
  models = purrr::map(
    formulas,
    function(f) formula_to_glm_model(f, my_metrics_roc, my_ctrl_roc)
  )
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Default-grid elastic net for the two interaction formulas, ROC-selected.
# BUG FIX: the original mapped over `formula` (the stats::formula function)
# instead of the intended formulas[2:3] strings.
enet_default_roc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map(formulas[2:3], formula_to_enet_model, my_metrics_roc, my_ctrl_roc)
)
# Tuned elastic net for the two interaction formulas, ROC-selected, seeding
# each grid from the corresponding default model.
# BUG FIX: the original passed `formula` (the stats function) as map2's first
# input instead of the intended formulas[2:3] strings.
enet_tune_roc <- tibble::tibble(
  formula = formulas[2:3],
  models = purrr::map2(formulas[2:3], enet_default_roc$models,
                       formula_to_enet_tune_model,
                       my_metrics_roc, my_ctrl_roc)
)
## Warning: from glmnet C++ code (error code -100); Convergence for 100th lambda
## value not reached after maxit=100000 iterations; solutions for larger lambdas
## returned
# Tuning profiles (lambda on a log axis) and variable-importance rankings
# for the ROC-selected elastic nets; only the top 20 terms of the larger
# interaction model are shown.
plot(enet_tune_roc$models[[1]], xTrans = log)
plot(enet_tune_roc$models[[2]], xTrans = log)
plot(varImp(enet_tune_roc$models[[1]]))
plot(varImp(enet_tune_roc$models[[2]]), top = 20)
# Neural network with caret's default grid, this time selected on ROC.
nnet_default_roc <- train(
  outcome ~ .,
  data = df_scaled,
  method = 'nnet',
  preProcess = c("center", "scale"),
  metric = my_metrics_roc,
  trControl = my_ctrl_roc,
  trace = FALSE
)
# Same size/decay sweep as the accuracy run, re-declared for the ROC run.
nnet_grid <- expand.grid(
  size = c(5, 10, 20),
  decay = exp(seq(-6, 2, length.out = 11))
)
# Refit the neural network over the custom grid, selecting on ROC.
nnet_tune_roc <- train(
  outcome ~ .,
  data = df_scaled,
  method = 'nnet',
  preProcess = c("center", "scale"),
  metric = my_metrics_roc,
  tuneGrid = nnet_grid,
  trControl = my_ctrl_roc,
  trace = FALSE
)
# Performance across the grid (decay on a log axis).
plot(nnet_tune_roc, xTrans = log)
# Random forest with the default mtry grid, ROC-selected; importance = TRUE
# enables the varImp() ranking below.
rf_default_roc <- train(
  outcome ~ .,
  data = df_scaled,
  method = 'rf',
  metric = my_metrics_roc,
  trControl = my_ctrl_roc,
  importance = TRUE
)
plot( varImp(rf_default_roc) )
# Gradient-boosted trees with caret's default grid, ROC-selected.
# verbosity = 0 silences xgboost; nthread = 1 keeps each fit single-threaded.
xgb_default_roc <- caret::train(
  outcome ~ .,
  data = df_scaled,
  method = 'xgbTree',
  metric = my_metrics_roc,
  trControl = my_ctrl_roc,
  verbosity = 0,
  nthread = 1
)
plot(xgb_default_roc)
# Refined xgboost grid anchored at the ROC run's best tune: sweep tree count,
# depth, and learning rate while holding the remaining hyperparameters fixed.
xgb_grid <- expand.grid(nrounds = seq(25, 50, by = 25),
max_depth = c(3,6,9,12),
eta = c(0.25, 0.5, 1) * xgb_default_roc$bestTune$eta,
gamma = xgb_default_roc$bestTune$gamma,
colsample_bytree = xgb_default_roc$bestTune$colsample_bytree,
min_child_weight = xgb_default_roc$bestTune$min_child_weight,
subsample = xgb_default_roc$bestTune$subsample)
# Refit xgboost over the refined grid, selecting on ROC.
xgb_tune_roc <- caret::train(outcome ~ ., data = df_scaled,
metric = my_metrics_roc,
trControl = my_ctrl_roc,
method = 'xgbTree',
verbosity = 0,
tuneGrid = xgb_grid,
nthread = 1)
# Grid performance and variable-importance comparisons.
plot(xgb_tune_roc)
plot( varImp(xgb_default_roc) )
plot( varImp(xgb_tune_roc) )
# Collect the resampling results from every ROC-selected model so their
# cross-validated performance can be compared on common folds.
caret_roc_compare <- resamples(
list(
glm_default_roc_1 = glm_default_roc$models[[1]],
glm_default_roc_2 = glm_default_roc$models[[2]],
glm_default_roc_3 = glm_default_roc$models[[3]],
glm_default_roc_4 = glm_default_roc$models[[4]],
enet_default_roc_1 = enet_default_roc$models[[1]],
enet_default_roc_2 = enet_default_roc$models[[2]],
enet_tune_roc_1 = enet_tune_roc$models[[1]],
enet_tune_roc_2 = enet_tune_roc$models[[2]],
nnet_default_roc = nnet_default_roc,
nnet_tune_roc = nnet_tune_roc,
rf_default_roc = rf_default_roc,
xgb_default_roc = xgb_default_roc,
xgb_tune_roc = xgb_tune_roc
)
)
# Dot plot of resampled ROC AUC for every model.
caret_roc_compare %>% dotplot(metric = my_metrics_roc)
# Named list of every ROC-selected model, used to persist each one.
roc_models_list <- list(
  glm_default_roc_1 = glm_default_roc$models[[1]],
  glm_default_roc_2 = glm_default_roc$models[[2]],
  glm_default_roc_3 = glm_default_roc$models[[3]],
  glm_default_roc_4 = glm_default_roc$models[[4]],
  enet_default_roc_1 = enet_default_roc$models[[1]],
  enet_default_roc_2 = enet_default_roc$models[[2]],
  enet_tune_roc_1 = enet_tune_roc$models[[1]],
  enet_tune_roc_2 = enet_tune_roc$models[[2]],
  nnet_default_roc = nnet_default_roc,
  nnet_tune_roc = nnet_tune_roc,
  rf_default_roc = rf_default_roc,
  xgb_default_roc = xgb_default_roc,
  xgb_tune_roc = xgb_tune_roc
)
# Save each model to "<entry name>.rds" so it can be reloaded without
# re-training.  paste0() replaces the paste(..., sep = '') anti-pattern;
# the file names are unchanged.
for (name in names(roc_models_list)) {
  roc_models_list[[name]] %>% readr::write_rds(paste0(name, '.rds'))
}